######################################################
##### This file replicates Figure 2 in the paper #####
######################################################
rm(list = ls())
library(tidyverse)

### load dictionary words
count <- read.csv("../datasets/1_dictionary_count.csv")

### calculate log odds

# * 1) write a function to calculate the standardized log odds
stdLogOdds <- function(data){
   ## generate Reli and Secu matrix of word counts
   dataS <- filter(data, label == "secular")
   dataR <- filter(data, label == "religious")
   common <- dataS %>% inner_join(dataR, by = "word")
   commonS <- common %>% select(word, count.x)
   commonR <- common %>% select(word, count.y)
   S <- matrix(commonS$count.x, 1, nrow(commonS))
   R <- matrix(commonR$count.y, 1, nrow(commonR))
   colnames(S) <- commonS$word
   colnames(R) <- commonR$word
   ## calculate the weights by standardized log odds 
   a <- 1
   piS <- (colSums(S) + a) / (sum(S) + ncol(S) * a)
   piR <- (colSums(R) + a) / (sum(R) + ncol(R) * a)
   logOddsRatio <- log(piR / (1 - piR)) - log(piS / (1 - piS))
   varLogOddsRatio <- 1
   stdLogOdds <- logOddsRatio / sqrt(varLogOddsRatio)
   ## convert it into a data frame and sort the weight
   stdLogOdds_df <- as.data.frame(stdLogOdds)
   word <- common$word
   stdLogOdds_df <- stdLogOdds_df %>%
      mutate(word = word) %>%
      arrange(stdLogOdds) 
   return(stdLogOdds_df)
}
# * 2) calculate log odds
weight <- stdLogOdds(count)
count <- count %>% arrange(count)
data <- count
dataS <- filter(data, label == "secular")
dataR <- filter(data, label == "religious")
common <- dataS %>% inner_join(dataR, by = "word")
common <- merge(common, weight, by = "word")

common$count <- common$count.x + common$count.y

set.seed(123)
common <- common[sample(1:nrow(common), 150),]

pdf("../outputplots/figure2.pdf", height = 7, width = 10)
par(mar = c(2, 0.5, 0.5, 0.2))
plot(c(0,0), 
     xlim=c(-max(common$stdLogOdds)-0.6, max(common$stdLogOdds)+0.6),
     ylim=c(min(log(common$count))-0.2, max(log(common$count))+0.8),
     type="n", xaxt="n", xlab="", yaxt="n", ylab="", bty="n")
segments(0, 0, 0, max(log(common$count)),lty=3, col="grey", lwd = 3)
diff <- common$stdLogOdds/max(abs(common$stdLogOdds))
thresh <- 0
negdiff <- diff*(diff < -thresh)
posdiff <- diff*(diff > thresh)
middiff <- diff*(diff < thresh & diff > -thresh)
colors <- grDevices::rgb(posdiff+.8, .75, -negdiff+.8, maxColorValue=2)
text(common$stdLogOdds, log(common$count), 
     common$word, cex = 1.2,
     col = colors)
arrows(x0 = 0.1*max(common$stdLogOdds), y0 = min(log(common$count))-0.25,
       x1 = 0.9*max(common$stdLogOdds), y1 = min(log(common$count))-0.25,
       length = 0.25, angle = 30,
       code = 2)
text(0.75*max(common$stdLogOdds)-0.2, min(log(common$count))+0.25,
     "Religiosity", cex=1.2, pos=1, srt = 0, font = 2)
arrows(x0 = 0.1*min(common$stdLogOdds), y0 = min(log(common$count))-0.25,
       x1 = 0.9*min(common$stdLogOdds), y1 = min(log(common$count))-0.25,
       length = 0.25, angle = 30,
       code = 2)
text(0.75*min(common$stdLogOdds)+0.2, min(log(common$count))+0.25,
     "Secularism", cex=1.2, pos=1, srt = 0, font = 2)
arrows(x0 = 0.02*min(common$stdLogOdds), y0 = 0.9*max(log(common$count)),
       x1 = 0.02*min(common$stdLogOdds), y1 = max(log(common$count))+0.85,
       length = 0.25, angle = 30,
       code = 2)
text(0.1*min(common$stdLogOdds), max(log(common$count))+0.3,
     "Frequency", cex=1.2, pos=1, srt = 90, font = 2)
# x-axis
segments(-max(common$stdLogOdds), -10, max(common$stdLogOdds), -10)
mtext("Log Odds", outer = FALSE, 
      side = 1, cex=1.2, font = 2)
dev.off()

